import re
import requests
from lxml import etree

# Demo script: download the Baidu News front page and pull the link texts
# out of it with lxml's XPath support.
baidu_url = "https://news.baidu.com/"

# Send a browser-like User-Agent with the request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# requests has no default timeout, so without one a stalled connection
# would block this script forever.
response = requests.get(baidu_url, headers=header, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were
# the news page and printing a misleading result.
response.raise_for_status()
data = response.content.decode('utf-8')


# 1. Parse the HTML text into an element tree that can be queried with XPath.
xpath_data = etree.HTML(data)

# XPath quick reference:
#   child node:                    /
#   node at any depth:             //
#   exact tag match:               //a[@attr="value"]
#   text wrapped by a tag:         text()
#   attribute value:               @href
#   xpath() returns its matches as a list
#   indexes start at 1 and only select among sibling tags
#   fuzzy (substring) match:       //div[contains(@id,"normalthread")]
#   following sibling nodes:       following-sibling::*


# 2. Run an XPath query. Alternative queries are kept below as examples.
# result = xpath_data.xpath('//a/text()')
# result = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=1"]/text()')
# result = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=1"]/@href')
result = xpath_data.xpath('//li/a/text()')
print(result)


# Debug helper: dump the downloaded page to disk for offline inspection.
# with open('02.html',mode='w',encoding='utf-8') as file:
#     file.write(data)
